library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(wordcloud2)
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

Data import and cleaning

There is an inclusion criteria for our analysis:

We will exclude free games from our analysis. This decision is based on the fact that in free games, in-app purchases often serve as alternatives to traditional game sales. However, since we lack access to data regarding these in-app purchases, including such games in our analysis would be both impractical and imprecise.

We will remove columns that contain URLs, as they do not provide useful information for our analysis.

Additionally, we will filter out variables related to developers and publishers for two reasons: Firstly, we believe it is challenging, if not impossible, to make predictions using these features. Secondly, we assume that brand effects should be reflected through a combination of game features and keywords, thus including most of the brand impact.

Furthermore, we will exclude the average and median playtime within two weeks, these variables is redundant since there are variables for average and median playtime over the game’s lifetime, also these variables are biased toward games with high replay value and does not accurately reflect overall popularity.

Lastly, we won’t include the ‘Full Audio Language’ variable, as some games may not contain dialogue, making it irrelevant for our analysis.

df = read.csv("DATA/games.csv")|>
  janitor::clean_names()|>
  subset(select = -c(dlc_count, about_the_game, reviews, header_image, website, support_url, support_email, metacritic_score, metacritic_url, notes, developers, publishers, screenshots, movies, score_rank, average_playtime_two_weeks, median_playtime_two_weeks, full_audio_languages))|>
  subset(price > 0)

Change the format of release date.

df = df|>
  mutate(release_date = as.Date(release_date, format = "%b %d, %Y"))

EDA

Trend of keywords frequency across different years

It might be informative to visualize the trend change of game genre developed in different years since WOW might be good game at 2000s, while games like Overwatch might be better game in 2020s.

genre_freq_year = df|>
  mutate(year = year(release_date))|>
  separate_rows(genres, sep = ",")|>
  group_by(year, genres)|>
  summarise(n_obs = n())|>
  group_by(year)
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
genre_freq_year_total = df|>
  mutate(year = year(release_date))|>
  separate_rows(genres, sep = ",")|>
  group_by(year)|>
  summarise(n_obs_total = n())
  

genre_freq_year_total|>
  knitr::kable()
year n_obs_total
1997 3
1998 1
1999 2
2000 2
2001 7
2002 2
2003 7
2004 7
2005 10
2006 79
2007 118
2008 230
2009 477
2010 419
2011 477
2012 659
2013 1011
2014 3312
2015 6109
2016 9853
2017 14823
2018 19051
2019 18579
2020 22777
2021 27694
2022 30735
2023 19446
NA 277

We could see from the table that there are very few games on steam before 2006, genre frequency might be strongly affected by randomness in those years, so we choose only to plot the trend after 2005.

#here we limit the year to later than 2002 since steam is lauched in 2002, we think the game before 2002 might be not representative. 
left_join(genre_freq_year, genre_freq_year_total, by = "year")|>
  subset(year > 2005)|>
  mutate(genre_ratio = n_obs/n_obs_total)|>
  ungroup()|>
  plot_ly(x = ~year, y = ~genre_ratio, color = ~genres, type = "scatter", mode = "lines+markers",  colors = "viridis")
popular_genres_bar_plot = left_join(genre_freq_year, genre_freq_year_total, by = "year")|>
  subset(year > 2005)|>
  mutate(genre_ratio = n_obs/n_obs_total)|>
  ggplot(aes(x = year, y = genre_ratio, fill = genres)) + 
  geom_col(position = 'stack', width = 0.6)+
  theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))
popular_genres_bar_plot

Observation 需要再加一下

Word Cloud visualization

Wwordcloud could help visualize the the frequency of keywords in popular/unpopular games.

overwhelm_popular_wordcloud = df|>
  subset((positive+negative) > 500)|>
  subset(positive/(positive+negative) > 0.95)|>
  separate_rows(tags, sep = ",")|>
  group_by(tags)|>
  summarise(n_obs = n())|>
  wordcloud2()

popular_tags_wordcloud = df_popular|>
  separate_rows(tags, sep = ",")|>
  group_by(tags)|>
  summarise(n_obs = n())|>
  wordcloud2()

unpopular_tags_wordcloud = df_unpopular|>
  separate_rows(tags, sep = ",")|>
  group_by(tags)|>
  summarise(n_obs = n())|>
  wordcloud2()
overwhelm_popular_wordcloud
popular_tags_wordcloud
unpopular_tags_wordcloud

We can observe differences among the three word clouds above.

Firstly, casual games are prevalent among popular games but not as common in unpopular games. This could be because casual games attract a larger user base, increasing the likelihood of meeting the popularity criteria.

Simultaneously, we notice that the tag “difficult” appears more frequently in the word cloud for popular games. This aligns with reality, as some of the best games in history, such as Elden Ring and Sekiro: Shadows Die Twice, are known for their challenging gameplay.

An interesting observation is that the ratio of 2D games to 3D games is higher in popular games compared to unpopular ones. This is reasonable since plot and game mechanics often take precedence in game development. Small studios may struggle to allocate sufficient resources to create high-quality 3D environments, prompting them to opt for 2D games more frequently. This contributes to the higher frequency of 2D games in the popular category.

Furthermore, anime and cute are tags that exhibit a higher frequency in popular games. The exact reason for this trend is uncertain, but we plan to conduct a more in-depth analysis of games associated with these tags.

model fitting

Since it is hard for models to directly analyze keywords as string, we one-hot encoded categories, genres, and tags, which facilitate models to analyze the dataset.

df_concat = df|>
  mutate(keywords = paste(df$categories, df$genres, df$tags, sep = ","))|>
  subset(select = -c(categories, genres, tags))
keywords_df = df_concat|>
  subset(select = c(app_id, keywords))|>
  separate_rows(keywords, sep = ",")|>
  distinct(app_id, keywords, .keep_all = TRUE)|>
  mutate(value = 1)|>
  subset(keywords != "")|>
  pivot_wider(names_from = keywords, values_from = value, values_fill = 0)

one_hot_encoded_df = left_join(df_concat, keywords_df, by = "app_id")

popular_encoded_df = one_hot_encoded_df|>
  subset((positive+negative) > 10)|>
  subset(positive/(positive+negative) > 0.8)|>
  subset(estimated_owners != "0 - 20000")|>
  subset(median_playtime_forever > 120)|>
  mutate(popular = 1)

unpopular_encoded_df = anti_join(one_hot_encoded_df, popular_encoded_df, by="app_id")|>
  mutate(popular = 0)

encoded_with_label_df = rbind(popular_encoded_df, unpopular_encoded_df)|>
  subset(select = -c(positive, negative, estimated_owners, median_playtime_forever, average_playtime_forever, recommendations, user_score))